library(dplyr)
library(plotly)
library(reshape2)
library(lubridate)
library(randomForestSRC)
One dimensional Partial Dependence Plot
days <- read.csv("day.csv")
hour <- read.csv("hour.csv")
days$dteday <- as_date(days$dteday)
days_select <- select(days, workingday, holiday, temp, hum, windspeed, cnt)
days_select$days_2011 <- int_length(interval(ymd("2011-01-01"), days$dteday)) / (3600*24)
days_select$Winter <- ifelse(days$season == 1, 1, 0)
days_select$Fall <- ifelse(days$season == 4, 1, 0)
days_select$Summer <- ifelse(days$season == 3, 1, 0)
days_select$Misty <- ifelse(days$weathersit == 2, 1, 0)
days_select$Rain <- ifelse(days$weathersit == 3 | days$weathersit == 4, 1, 0)
days_select$Temp <- days_select$temp * 47 - 8
days_select$Hum <- days_select$hum * 100
days_select$Windspeed <- days_select$windspeed * 67
rfsrc <- rfsrc(cnt~., data=days_select) # Creamos la función random forest
res <- select(days_select, days_2011, Temp, Hum, Windspeed, cnt)
rows <- nrow(days_select)
for(c in names(res)[1:4])
{
for(i in 1:rows){
resul <- days_select
resul[[c]] <- days_select[[c]][i]
pred <- predict(rfsrc, resul)$predicted
res[[c]][i] <- sum(pred) / rows
}
}
figure1 = ggplot(days_select, aes(x = Temp, y = res$Temp)) + ylim(0, 6000) + geom_line() + geom_rug(sides="b", alpha = 0.5) + labs(x = "Temperature")
figure2 = ggplot(days_select, aes(x = Hum, y = res$Hum)) + ylim(0, 6000) + geom_line() + geom_rug(sides="b", alpha = 0.5) + labs(x = "Humidity")
figure3 = ggplot(days_select, aes(x = Windspeed, y = res$Windspeed)) + ylim(0, 6000) + geom_line() + geom_rug(sides="b", alpha = 0.5) + labs(x = "Windspeed")
figure4 = ggplot(days_select, aes(x = days_2011, y = res$days_2011)) + ylim(0, 6000) + geom_line() + geom_rug(sides="b", alpha = 0.5) + labs(x = "Days since 2011", y = "Predictions")
subplot(figure1, figure2, figure3,figure4, titleX = TRUE, titleY = TRUE, shareX = FALSE, shareY = TRUE)
2. Bidimensional Partial Dependency Plot
selection <- sample_n(days_select, 40)
Temp <- selection$Temp
Hum <- selection$Hum
Temp_Hum <- inner_join(data.frame(Temp),data.frame(Hum), by=character())
Temp_Hum$p <- 0
for(i in 1:nrow(Temp_Hum)){
resul <- days_select
resul[["Temp"]] <- Temp_Hum[["Temp"]][i]
resul[["Hum"]] <- Temp_Hum[["Hum"]][i]
pred <- predict(rfsrc, resul)$predicted
Temp_Hum[["p"]][i] <- sum(pred) / rows
}
figure5 = ggplot(Temp_Hum, aes(x = Temp, y = Hum, fill = p)) + geom_tile(width = 10, height = 15) + geom_rug(alpha = 0.5) + labs(x = "Temperature", y = "Humidity", fill = "Num bikes")
figure5

3. PDP to explain the price of a house
house_data <- read.csv("kc_house_data.csv")
selection <- sample_n(house_data, 1000)
selection <- select(selection, bedrooms, bathrooms, sqft_living, sqft_lot, floors, yr_built, price)
rfsrc1 <- rfsrc(price~., data=selection)
res <- select(selection, bedrooms, bathrooms, sqft_living, floors, price)
rows <- nrow(selection)
for(name in names(res)[1:4])
{
for(i in 1:rows){
resul <- selection
resul[[name]] <- selection[[name]][i]
predic <- predict(rfsrc1, resul)$predicted
res[[name]][i] <- sum(predic) / rows
}
}
figure6 = ggplot(selection, aes(x = bedrooms,y = res$bedrooms)) + geom_line() + geom_rug(sides = "b", alpha = 0.5) + labs(x = "Bedrooms",y = "Prediction")
figure7 = ggplot(selection,aes(x=bathrooms,y=res$bathrooms)) + geom_line() + geom_rug(sides = "b", alpha = 0.5) + labs(x = "Bathrooms", y = "")
figure8 = ggplot(selection,aes(x=sqft_living,y=res$sqft_living)) + geom_line() + geom_rug(sides = "b", alpha = 0.5) + labs(x = "Sqft Living", y = "")
figure9 = ggplot(selection,aes(x=floors,y=res$floors)) + geom_line() + geom_rug(sides = "b", alpha = 0.5) + labs(x = "Floors", y = "")
subplot(figure6, figure7, figure8, figure9, titleX = TRUE, titleY = TRUE, shareX = FALSE, shareY = FALSE)